This R Markdown document reproduces the main analyses and figures for tumor evolution in high-clonality LUAD (Lung Adenocarcinoma) dataset from the Sherlock-Lung study
manuscript title: Deciphering lung adenocarcinoma evolution and the role of LINE-1 retrotransposition
Fig. 1: Evolutionary dynamics of lung cancer.
(You may need to install some packages if missing)
library(tidyverse)
library(ggplot2)
library(ggsankey)
library(scales)
library(hrbrthemes) # For theme_ipsum_rc
library(cowplot)
library(forcats)
library(ggrepel)
library(ggnewscale)
library(data.table)
library(ggasym)
library(ggpmisc)
library(broom)
library(ggsci)
# --- Load Input Datasets ------------------------------------------------------
# All data files should be placed in the appropriate subfolders.
# Example folder structure:
# ./data/ - for main input files
# ./functions/ - for custom R functions
# ./output/ - for saving plots and results
# Replace the filenames below with your actual dataset filenames.
load('./data/BBsolution_final3_short.RData', verbose = TRUE)
## Loading objects:
## BBsolution
## BBsolution4
## hq_samples
## wgs_groups_info
## wgs_groups_info2
load('./data/covdata0.RData', verbose = TRUE)
## Loading objects:
## covdata0
load('./data/clinical_data.RData', verbose = TRUE)
## Loading objects:
## clinical_data
load('./data/sherlock_data_all.RData', verbose = TRUE)
## Loading objects:
## sherlock_data
## sherlock_data_full
## sherlock_overall
## sherlock_type_colors
## sherlock_freq
## sherlock_freq_hq
load('./data/sherlock_variable.RData', verbose = TRUE)
## Loading objects:
## sherlock_variable
load('./data/sp_group_data.RData', verbose = TRUE)
## Loading objects:
## sp_group_color
## sp_group_color_new
## sp_group_data
## sp_group_data2
load('./data/id2data.RData', verbose = TRUE)
## Loading objects:
## id2data
## id2color
# Load additional analysis datasets
load('./data/Chronological_timing_short.RData', verbose = TRUE)
## Loading objects:
## MRCAdata
## subclonesTimeAbs
## subclonesTimeAbsLinear
## mrate
## rateDeam
## timingclass_groups
## timingInfo
load('./data/clsdata.RData', verbose = TRUE)
## Loading objects:
## clsdata
## clscolors
load('./data/ZTW_functions.RData', verbose = TRUE)
## Loading objects:
## barcode2spg
## sp_group_color
## sp_group_color_new
## sp_group_lift
## sp_group_data
## sp_group_major_new
## sp_group_major
load('./data/DP_info_data.RData', verbose = TRUE)
## Loading objects:
## DP_info_data
## Cluster_info
load('./data/covdata0.RData', verbose = TRUE)
## Loading objects:
## covdata0
load('./data/Signature_Lumidl.RData')
load('./data/Mutation_Signature_Probability_SBS.RData', verbose = TRUE)
## Loading objects:
## Mutation_Signature_Probability_SBS
load('./data/sherlock_maf.RData', verbose = TRUE)
## Loading objects:
## sherlock_maf
## sherlock_maf_freq
load('./data/sherlock_driver_mutations.RData', verbose = TRUE)
## Loading objects:
## sherlock_driver_mutations
load('./data/sherlock_mt.RData', verbose = TRUE)
## Loading objects:
## sherlock_mtvcf
## sherlock_mtbb
## hq_samples
load('./data/sigcol.RData', verbose = TRUE)
## Loading objects:
## sigcol
# --- Subset to LUAD high clonality data Only ------------------------------------------------------
# Filter to high-quality LUAD samples only
hq_samples2 <- covdata0 %>%
filter(Histology == 'Adenocarcinoma', Tumor_Barcode %in% hq_samples) %>%
pull(Tumor_Barcode)
rm(list = c('hq_samples'))
# Create a summary table of LUAD cohort and plot group composition
tdata0 <- covdata0 %>%
left_join(sp_group_data2) %>%
mutate(
Data = if_else(
Tumor_Barcode %in% hq_samples2,
'High-quality tumors',
'Other tumors'
)
) %>%
select(Tumor_Barcode, Data, Histology, SP_Group_New) %>%
filter(Histology == "Adenocarcinoma", Tumor_Barcode %in% hq_samples2)
# Sankey data preparation
tdata <- tdata0 %>% make_long(Data, SP_Group_New)
tdata_nr <- tdata %>%
filter(!is.na(node)) %>%
group_by(x, node) %>%
summarise(count = n())
tdata <- tdata %>% left_join(tdata_nr)
tdata$node <- factor(tdata$node, levels = c('High-quality tumors', 'AS_N', 'EU_N', 'EU_S', 'Others'))
# Sankey plot
ggplot(
tdata,
aes(
x = x, next_x = next_x,
node = node, next_node = next_node,
fill = factor(node), label = node
)
) +
geom_sankey(flow.alpha = 0.6, node.color = "gray30") +
geom_sankey_text(size = 4, color = "black") +
#geom_sankey_text(aes(label = count), size = 3.5, check_overlap = TRUE) +
scale_fill_manual(
values = c("#14315C","#E64B35FF", "#4DBBD5FF", "#00A087FF", "#3C5488FF" )
) +
theme_sankey(base_size = 18) +
labs(x = NULL) +
theme_ipsum_rc(axis = FALSE, axis_text_size = 16) +
theme(
axis.text.y = element_blank(),
panel.grid = element_blank(),
legend.position = "none"
)
# ggsave('./output/Sherlock_hq_LUAD_WGS_dataset.pdf', width = 7, height = 5, device = cairo_pdf)
# Plot WGD frequency in LUAD HQ samples by spatial group
tmp <- BBsolution4 %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
left_join(wgs_groups_info) %>%
count(SP_Group, WGD_Status) %>%
group_by(SP_Group) %>%
mutate(Freq = n / sum(n)) %>%
ungroup()
# Bar plot of WGD frequency
p2 <- tmp %>%
left_join(sp_group_data) %>%
ggplot(aes(
x = SP_Group_New,
y = n,
fill = SP_Group_New,
alpha = WGD_Status
)) +
geom_bar(stat = "identity", position = "fill", width = 0.8) +
geom_text(
aes(label = paste0(n, '\n', sprintf("%1.1f", Freq * 100), "%")),
position = position_fill(vjust = 0.5),
colour = "black",
size = 3.5
) +
scale_y_continuous(
breaks = pretty_breaks(),
labels = percent_format(),
expand = c(0, 0)
) +
scale_fill_manual(values = sp_group_color_new) +
scale_alpha_manual(values = c(0.5, 1)) +
theme_ipsum_rc(
axis_title_just = 'm',
axis_title_size = 16,
axis_text_size = 12,
grid = 'Yy',
ticks = FALSE
) +
labs(fill = "Group", x = "", y = 'Percentage') +
guides(fill = "none")
p2
#ggsave('./output/wgd_frequency_hq_luad.pdf', p2, width = 5, height = 6, device = cairo_pdf)
# Load additional data for Figure 1c
# Calculate ratio of mutations with copy number = 2
tmp <- Mutation_Signature_Probability_SBS %>%
left_join(DP_info_data %>% select(ID, no.chrs.bearing.mut)) %>%
filter(no.chrs.bearing.mut == 2) %>%
group_by(Tumor_Barcode) %>%
summarise(CNV2 = sum(SBS1 + SBS5, na.rm = TRUE))
tmp <- Mutation_Signature_Probability_SBS %>%
left_join(DP_info_data %>% select(ID, no.chrs.bearing.mut)) %>%
group_by(Tumor_Barcode) %>%
summarise(Total = sum(SBS1 + SBS5, na.rm = TRUE)) %>%
left_join(tmp)
tmp <- tmp %>% replace_na(list(CNV2 = 0))
tmp <- BBsolution4 %>%
select(Tumor_Barcode, WGD_Status) %>%
left_join(wgs_groups_info) %>%
left_join(tmp) %>%
mutate(Ratio = CNV2 / Total)
# Density plot for WGD tumors
p3 <- tmp %>%
filter(WGD_Status == 'WGD') %>%
left_join(sp_group_data) %>%
filter(SP_Group != "Others") %>%
ggplot(aes(Ratio, fill = SP_Group_New, group = SP_Group_New)) +
geom_density(alpha = 0.8) +
scale_x_continuous(
breaks = pretty_breaks(),
labels = percent_format(),
limits = c(0, 1)
) +
scale_y_continuous(breaks = pretty_breaks()) +
scale_fill_manual(values = sp_group_color_new, breaks = sp_group_major_new) +
theme_ipsum_rc(
base_size = 14,
axis_title_just = 'm',
axis_title_size = 16,
grid = FALSE,
ticks = TRUE
) +
panel_border(color = 'black') +
labs(fill = 'Group', x = '% mutations with copy number = 2', y = 'Density')
p3
#ggsave('./output/WGD_CNV2_all.pdf', p3, width = 6, height = 4, device = cairo_pdf)
# Calculate ratio of clonal [early]+[late] mutations
clonal_ratio <- clsdata %>%
filter(!is.na(CLS)) %>%
select(-Freq) %>%
pivot_wider(names_from = CLS, values_from = n) %>%
mutate(
Ratio = (`clonal [early]` + `clonal [late]`) /
(`clonal [early]` + `clonal [late]` + `clonal [NA]`)
) %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
summarise(Ratio = median(Ratio, na.rm = TRUE))
print(clonal_ratio)
## # A tibble: 1 × 1
## Ratio
## <dbl>
## 1 0.313
# Boxplot: Proportion of mutations by clonality and group
p4 <- clsdata %>%
left_join(wgs_groups_info) %>%
left_join(sp_group_data) %>%
filter(Tumor_Barcode %in% hq_samples2, !is.na(CLS)) %>%
ggplot(aes(CLS, Freq, fill = SP_Group_New)) +
geom_boxplot() +
labs(x = "", y = "Proportion of total mutations", fill = 'Group') +
scale_fill_manual(
values = sp_group_color_new,
breaks = names(sp_group_color_new)
) +
theme_ipsum_rc(
axis_text_size = 12,
axis_title_just = 'm',
axis_title_size = 14
) +
theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)) +
panel_border(color = 'black')
p4
#ggsave('./output/CLS_SP_Group_hq_luad.pdf', p4, width = 6, height = 6, device = cairo_pdf)
For each cancer driver gene, a Fisher’s exact test was performed on a 2x2 contingency table (binary variables: smoking status and early clonal mutation status). The significance thresholds for P < 0.05 (green) and FDR < 0.05 (red), calculated using the Benjamini–Hochberg method, are indicated by dashed lines.
# Load required data (relative paths for reproducibility)
genelist <- readr::read_rds('./data/drivers_intogene.RDS') %>% pull(symbol)
# Join clonality info to driver mutations
clonality_info <- sherlock_mtvcf %>% select(Tumor_Barcode, MutationID, CLS)
driver_tbl <- sherlock_driver_mutations %>%
mutate(Chromosome = stringr::str_remove(Chromosome, "^chr")) %>%
mutate(MutationID = paste(Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":")) %>%
select(Tumor_Barcode, MutationID, Hugo_Symbol, Variant_Classification) %>%
left_join(clonality_info, by = c("Tumor_Barcode", "MutationID"))
# Restrict to genelist and high-quality LUAD samples
tmpdata <- driver_tbl %>%
left_join(wgs_groups_info %>% select(Tumor_Barcode, SP_Group), by = "Tumor_Barcode") %>%
filter(Hugo_Symbol %in% genelist, Tumor_Barcode %in% hq_samples2)
# Frequency by group, gene, clonality
freq_tbl <- tmpdata %>%
count(SP_Group, Hugo_Symbol, CLS) %>%
left_join(
wgs_groups_info %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
count(SP_Group, name = "size"),
by = "SP_Group"
) %>%
mutate(Freq = n / size) %>%
arrange(desc(Freq))
# Only genes with at least 8 total mutations
genelist_plot <- freq_tbl %>%
group_by(Hugo_Symbol) %>%
summarise(n = sum(n), .groups = "drop") %>%
arrange(desc(n)) %>%
filter(n > 8) %>%
pull(Hugo_Symbol)
# Barplot: stacked (counts)
p_e1 <- freq_tbl %>%
left_join(sp_group_data, by = "SP_Group") %>%
filter(Hugo_Symbol %in% genelist_plot) %>%
mutate(Hugo_Symbol = factor(Hugo_Symbol, levels = genelist_plot)) %>%
ggplot(aes(Hugo_Symbol, Freq, fill = CLS)) +
geom_bar(position = "stack", stat = "identity", size = 0.1) +
facet_wrap(~SP_Group_New, ncol = 1, scales = "free") +
scale_fill_manual(values = clscolors) +
scale_y_continuous(breaks = scales::pretty_breaks(), labels = scales::percent_format()) +
labs(x = "", y = "Driver mutation frequency\n", fill = "") +
theme_ipsum_rc(axis_text_size = 12, axis_title_just = "m", axis_title_size = 16) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), panel.spacing = unit(0.1, "lines"))
p_e1
#ggsave('./output/Driver_Gene_Clonality.pdf', p_e1, width = 9, height = 12, device = cairo_pdf)
# Barplot: proportions (normalized within gene)
p_e2 <- freq_tbl %>%
left_join(sp_group_data, by = "SP_Group") %>%
filter(Hugo_Symbol %in% genelist_plot) %>%
mutate(Hugo_Symbol = factor(Hugo_Symbol, levels = genelist_plot)) %>%
ggplot(aes(Hugo_Symbol, Freq, fill = CLS)) +
geom_bar(position = "fill", stat = "identity", size = 0.1) +
facet_wrap(~SP_Group_New, ncol = 1, scales = "free") +
scale_fill_manual(values = clscolors) +
scale_y_continuous(breaks = scales::pretty_breaks(), labels = scales::percent_format()) +
labs(x = "", y = "Proportion\n", fill = "") +
theme_ipsum_rc(axis_text_size = 12, axis_title_just = "m", axis_title_size = 16) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), panel.spacing = unit(0.1, "lines"))
p_e2
#ggsave('./output/Driver_Gene_Clonality2.pdf', p_e2, width = 9, height = 12, device = cairo_pdf)
# =================== Statistical Enrichment: Fisher's Exact Test ===================
# Example: N_U vs S_U for enrichment of 'clonal [early]' in each gene
# proportion of clonal mutations vs other mutation by fisher exact test
tmpdata %>%
filter(SP_Group %in% c('N_A', 'N_U')) %>%
mutate(CLS = if_else(CLS == 'clonal [early]', 'Yes', 'No')) %>%
select(Tumor_Barcode, Hugo_Symbol, CLS, SP_Group) %>%
# unique() %>%
# count(Tumor_Barcode,Hugo_Symbol,sort=T) %>%
mutate(CLS = as.factor(CLS), SP_Group = as.factor(SP_Group)) %>%
group_by(Hugo_Symbol) %>%
do(tidy(fisher.test(.$CLS, .$SP_Group))) %>%
arrange(p.value)
## # A tibble: 49 × 7
## # Groups: Hugo_Symbol [49]
## Hugo_Symbol estimate p.value conf.low conf.high method alternative
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ARID1A 0 0.0833 0 2.26 Fisher's Exact T… two.sided
## 2 SETD2 0 0.192 0 2.17 Fisher's Exact T… two.sided
## 3 EGFR 1.41 0.300 0.754 2.71 Fisher's Exact T… two.sided
## 4 RPL5 0 0.364 0 22.3 Fisher's Exact T… two.sided
## 5 APC 0 0.4 0 26.0 Fisher's Exact T… two.sided
## 6 NF1 0 0.4 0 26.0 Fisher's Exact T… two.sided
## 7 RB1 3.39 0.524 0.110 293. Fisher's Exact T… two.sided
## 8 KRAS Inf 0.526 0.205 Inf Fisher's Exact T… two.sided
## 9 PTEN 3.52 0.545 0.170 261. Fisher's Exact T… two.sided
## 10 PIK3CA 2.04 0.661 0.254 26.5 Fisher's Exact T… two.sided
## # ℹ 39 more rows
tmpdata %>%
filter(SP_Group %in% c('N_U', 'S_U')) %>%
mutate(CLS = if_else(CLS == 'clonal [early]', 'Yes', 'No')) %>%
select(Tumor_Barcode, Hugo_Symbol, CLS, SP_Group) %>%
# unique() %>%
# count(Tumor_Barcode,Hugo_Symbol,sort=T) %>%
mutate(CLS = as.factor(CLS), SP_Group = as.factor(SP_Group)) %>%
group_by(Hugo_Symbol) %>%
do(tidy(fisher.test(.$CLS, .$SP_Group))) %>%
arrange(p.value)
## # A tibble: 48 × 7
## # Groups: Hugo_Symbol [48]
## Hugo_Symbol estimate p.value conf.low conf.high method alternative
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 EGFR 0 0.000494 0 0.320 Fisher's Exact … two.sided
## 2 KEAP1 Inf 0.0433 0.780 Inf Fisher's Exact … two.sided
## 3 KRAS 2.70 0.108 0.827 9.34 Fisher's Exact … two.sided
## 4 ERBB2 0 0.286 0 15.6 Fisher's Exact … two.sided
## 5 MGA 5.76 0.294 0.410 353. Fisher's Exact … two.sided
## 6 NFE2L2 Inf 0.333 0.0513 Inf Fisher's Exact … two.sided
## 7 ZFHX3 0 0.333 0 19.5 Fisher's Exact … two.sided
## 8 SMARCA4 2.67 0.391 0.330 34.8 Fisher's Exact … two.sided
## 9 RPL5 Inf 0.417 0.0359 Inf Fisher's Exact … two.sided
## 10 SETD2 Inf 0.417 0.0359 Inf Fisher's Exact … two.sided
## # ℹ 38 more rows
tresult <- tmpdata %>%
filter(SP_Group != 'Others') %>%
mutate(SP_Group = if_else(SP_Group %in% c('N_U', 'N_A'), 'Yes', 'No')) %>%
mutate(CLS = if_else(CLS == 'clonal [early]', 'Yes', 'No')) %>%
select(Tumor_Barcode, Hugo_Symbol, CLS, SP_Group) %>%
# unique() %>%
# count(Tumor_Barcode,Hugo_Symbol,sort=T) %>%
mutate(CLS = as.factor(CLS), SP_Group = as.factor(SP_Group)) %>%
group_by(Hugo_Symbol) %>%
do(tidy(fisher.test(.$CLS, .$SP_Group))) %>%
ungroup() %>%
arrange(p.value) %>%
filter(Hugo_Symbol %in% genelist) %>%
mutate(FDR = p.adjust(p.value))
# Volcano plot
p_volcano <- tresult %>%
left_join(
freq_tbl %>%
group_by(Hugo_Symbol, CLS) %>%
summarise(n = sum(n)) %>%
mutate(Ratio = n / sum(n)) %>%
filter(CLS == 'clonal [early]') %>%
arrange(Ratio) %>%
ungroup() %>%
select(Hugo_Symbol, Ratio)
) %>%
ggplot(aes(log2(estimate), -log10(p.value))) +
geom_hline(yintercept = -log10(0.001725), linetype = 2, col = '#BB0E3D') +
geom_hline(yintercept = -log10(0.05), linetype = 2, col = "#4daf4a") +
geom_vline(xintercept = 0, col = '#cccccc', size = 0.25) +
geom_point(aes(size = Ratio, fill = estimate > 1), pch = 21, stroke = 0.5) + #result %>% arrange(FDR) %>% filter(FDR<0.05) %>% tail(n=1) %>% pull(p.value)aes(size=Freq),
#scale_shape_manual(values = c(21,22,24)) +
ggrepel::geom_text_repel(
data = tresult %>% filter(p.value < 0.05),
aes(label = Hugo_Symbol)
) +
theme_ipsum_rc(
base_size = 15,
axis_title_just = 'm',
axis_title_size = 16,
ticks = T
) +
theme(
legend.position = 'top',
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
legend.key.width = unit(0.7, 'cm')
) +
coord_cartesian(clip = 'off') +
#scale_fill_manual(values= ncicolpal)+
scale_fill_aaas() +
scale_x_continuous(breaks = pretty_breaks(n = 7), limits = c(-5.1, 5.1)) +
scale_y_continuous(breaks = pretty_breaks()) +
scale_size_binned(nice.breaks = T, n.breaks = 6) +
panel_border(color = 'black', size = 0.3) +
labs(
x = 'log2(Odds ratio)',
y = '-log10(P-value)',
fill = 'Enrichment',
size = 'Proportion'
) + #fill='Early clonal mutation enriched in',size='Proportion of early clonal mutation'
guides(fill = guide_legend(override.aes = list(size = 3.5)))
p_volcano
#ggsave('./output/Driver_Gene_EarlyClonal_Enrichment_volcano.pdf', p_volcano, width = 7, height = 6, device = cairo_pdf)
#
# # Load ASCETIC/ID2 group data and gene frequency data
# load('./data/ASCETIC_ID2_group_pattern.RData', verbose = TRUE)
# # Assume gene_freq is prepared as in your workflow, or recompute as needed.
# # For the following, gene_freq should have columns for each group, e.g. AS_N, EU_N, EU_S, Others
#
# # Example for AS_N group (repeat similar for other groups):
# datatmp <- df_asn %>% slice(1:5) %>% filter(from != 'ELF3')
# datatmp$from_rank = "1"
# datatmp$to_rank = "2"
# # Add wildtype for initial node
#
# # Reformat for plotting
# as_n_curve <- datatmp %>%
# select(gene = from, rank = from_rank) %>%
# left_join(gene_freq %>% select(gene, AS_N)) %>%
# mutate(from = 'WT', from_rank = "0") %>%
# select(from, to = gene, value = AS_N, from_rank, to_rank = rank) %>%
# bind_rows(datatmp)
#
# rankdata <- bind_rows(
# as_n_curve %>% select(gene = from, rank = from_rank),
# as_n_curve %>% select(gene = to, rank = to_rank)
# ) %>% unique() %>% mutate(rank = as.integer(rank))
#
# # Define x/y coordinates for the arc diagram
# max_gene_rank <- rankdata %>% count(rank, sort = TRUE) %>% slice(1) %>% pull(n)
# datay <- rankdata %>%
# left_join(rankdata %>% count(rank, sort = TRUE)) %>%
# left_join(gene_freq %>% select(gene, AS_N)) %>%
# group_by(rank) %>%
# arrange((AS_N)) %>%
# mutate(seq = seq_along(gene)) %>%
# ungroup() %>%
# mutate(y = seq * max_gene_rank / (n + 1)) %>%
# select(gene, y)
#
# datav <- left_join(rankdata %>% select(gene, x = rank), datay) %>%
# left_join(gene_freq %>% select(gene, freq = 'AS_N'))
#
# dataz <- as_n_curve %>%
# arrange(value) %>%
# left_join(datav %>% select(from = gene, x, y)) %>%
# left_join(datav %>% select(to = gene, x2 = x, y2 = y)) %>%
# mutate(x2 = 0.98 * x2, y2 = y2)
#
# # Arc diagram for AS_N group
# p7 <- datav %>%
# ggplot(aes(x, y)) +
# geom_segment(
# data = dataz,
# aes(x = x, y = y, xend = x2, yend = y2, size = value, col = value),
# arrow = arrow(length = unit(0.04, 'npc'))
# ) +
# geom_point(fill = 'gray', col = 'black', pch = 21, size = 6) +
# geom_text(aes(label = gene), nudge_y = -0.08) +
# scale_color_viridis_c(alpha = 0.9, direction = -1) +
# theme_ipsum_rc(grid = FALSE, axis = FALSE) +
# labs(col = 'Number of Sample\n') +
# guides(size = 'none') +
# theme(
# axis.text.x = element_blank(),
# axis.title.x = element_blank(),
# axis.text.y = element_blank(),
# axis.title.y = element_blank(),
# legend.position = 'top',
# legend.key.width = unit(1.2, 'cm'),
# legend.key.height = unit(0.4, 'cm')
# )
# ggsave('./output/AS_S_ascetic_infer_new.pdf', p7, width = 4.5, height = 4, device = cairo_pdf)
#
# # Repeat similar code for EU_N, EU_S, Others using their respective dataframes (df_eun, df_eus, df_others) and gene_freq columns (EU_N, EU_S, Others)
Fold changes between relative proportions of clonal and subclonal mutations attributed to individual mutational signatures. Each point represents a tumor sample, and points are colored by mutational signature. P-values from the Wilcoxon rank-sum test are displayed at the bottom of the boxplots.
apobec_ratio <- sherlock_variable %>%
#filter(str_starts(name,'SBS_')) %>%
#mutate(name=str_remove(name,'^ludmil_')) %>%
filter(name %in% c('SBS2_ratio', 'SBS13_ratio')) %>%
pivot_wider(names_from = 'name', values_from = value) %>%
mutate(APOBEC = SBS2_ratio + SBS13_ratio) %>%
select(Tumor_Barcode, APOBEC_Ratio = APOBEC)
# APOBEC + Clonality
# load('../DP_info_data.RData')
# sigprobality <- read_delim('/Volumes/data/NSLC2/Mutations/Signature_1217/result_spextractor_cosmic3.2/NSLC.SBS288.all/SBS288/Suggested_Solution/COSMIC_SBS288_Decomposed_Solution/Activities/Decomposed_Mutation_Probabilities.txt',delim = '\t',col_names = T)
# colnames(sigprobality)[1] <- 'Tumor_Barcode'
# sigprobality <- sigprobality %>% mutate(Tumor_Barcode=str_replace(Tumor_Barcode,"_TPD_01$","_TPD_01.01")) #%>% rename(Tumor_Barcode=`Sample Names`)
#
# apobec_clone_data <- DP_info_data %>%
# select(Tumor_Barcode,ID,MutationTypes=mutType,Clone) %>%
# left_join(
# sigprobality %>% mutate(APOBEC=SBS2+SBS12) %>% select(Tumor_Barcode,MutationTypes,APOBEC)
# )
apobec_clone_data <- Mutation_Signature_Probability_SBS %>%
mutate(APOBEC = SBS2 + SBS12) %>%
select(Tumor_Barcode, ID, MutationType, Clone, APOBEC)
apobec_clone_data <- apobec_clone_data %>%
group_by(Tumor_Barcode, Clone) %>%
summarise(APOBEC = sum(APOBEC, na.rm = T), Total = n_distinct(ID)) %>%
ungroup()
tmp <- apobec_clone_data %>%
select(-Total) %>%
pivot_wider(names_from = 'Clone', values_from = 'APOBEC') %>%
replace_na(replace = list(`N` = 0, Y = 0, `NA` = 0)) %>%
mutate(
Subclone_Ratio = N / (N + Y),
Enrichment = N / Y,
total = N + Y + `NA`
) %>%
filter(total > 100) %>%
left_join(wgs_groups_info %>% select(Tumor_Barcode, SP_Group)) %>%
#filter(SP_Group!="Others") %>%
left_join(apobec_ratio)
# hq sample set
tmp <- tmp %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
left_join(sp_group_data)
# tmp %>%
# select(SP_Group,Tumor_Barcode,`APOBEC Enrichment (Subclonal/Clonal)`=Enrichment,`APOBEC subclonal_Ratio`=Subclone_Ratio) %>%
# pivot_longer(cols = -c(SP_Group,Tumor_Barcode)) %>%
# ggplot(aes(value,col=SP_Group))+
# geom_density(size=1)+
# facet_wrap(~name,scales = "free")+
# scale_x_continuous(breaks = pretty_breaks(),labels = label_comma())+
# scale_y_continuous(breaks = pretty_breaks(),labels = label_comma())+
# theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
# labs(x="\nAPOBEC subclonal mutation ratio", y = "Density") +
# panel_border(color = 'black')
#
# ggsave(filename = 'APOBEC_Subclonality.pdf',width = 9,height = 5.5,device = cairo_pdf)
tmp %>%
#filter(Enrichment>0) %>%
#mutate(Enrichment=if_else(Enrichment>5,5,Enrichment)) %>%
ggplot(aes(SP_Group_New, log2(Enrichment + 1), fill = (APOBEC_Ratio))) +
#annotation_logticks(sides="l")+
geom_jitter(
position = position_jitter(w = 0.2, h = 0),
size = 2.5,
shape = 21,
col = "white",
stroke = 0.2
) +
geom_boxplot(fill = NA, outlier.shape = NA, width = 0.6) +
scale_y_continuous(
breaks = pretty_breaks(),
expand = expansion(mult = c(0.02, 0.02))
) +
labs(
x = "",
y = "log2(FC +1), FC= APOBEC Subclonal/APOBEC Clonal",
fill = "APOBEC Ratio"
) +
theme_ipsum_rc(
base_size = 14,
axis_title_size = 14,
axis_title_just = 'm',
grid = "Y",
ticks = TRUE,
axis = ""
) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
panel_border(size = 0.5, color = "black") +
scale_fill_viridis_c() +
geom_hline(yintercept = 1, linetype = 2)
#ggsave(filename = 'APOBEC_Subclonality1.pdf',width = 6,height = 9,device = cairo_pdf)
# ggsave(
# filename = 'APOBEC_Subclonality1_hq_ID2paper_tmp.pdf',
# width = 6,
# height = 9,
# device = cairo_pdf
# )
tmp %>%
filter(N > 0) %>%
#filter(Enrichment>0) %>%
#mutate(Enrichment=if_else(Enrichment>5,5,Enrichment)) %>%
ggplot(aes(SP_Group_New, log2(Enrichment), fill = (APOBEC_Ratio))) +
#annotation_logticks(sides="l")+
geom_jitter(
position = position_jitter(w = 0.15, h = 0.05),
size = 2.5,
shape = 21,
col = "white",
stroke = 0.2
) +
geom_boxplot(fill = NA, outlier.shape = NA, width = 0.6) +
scale_y_continuous(
breaks = pretty_breaks(),
expand = expansion(mult = c(0.02, 0.02))
) +
labs(
x = "",
y = "log2(FC), FC=APOBEC Subclonal/APOBEC Clonal",
fill = "APOBEC Ratio"
) +
theme_ipsum_rc(
base_size = 14,
axis_title_size = 14,
axis_title_just = 'm',
grid = "Y",
ticks = TRUE,
axis = ""
) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
panel_border(size = 0.5, color = "black") +
scale_fill_viridis_c() +
geom_hline(yintercept = 0, linetype = 2)
#ggsave(filename = 'APOBEC_Subclonality2.pdf',width = 6,height = 9,device = cairo_pdf)
# ggsave(
# filename = 'APOBEC_Subclonality2_hq_ID2paper_tmp.pdf',
# width = 6,
# height = 9,
# device = cairo_pdf
# )
# Subclonality by the Ratio
## add signature ratio
mutation_ratio <- sherlock_variable %>%
# filter(str_starts(name,'ludmil_')) %>%
# mutate(name=str_remove(name,'^ludmil_')) %>%
filter(str_starts(name, 'SBS'), str_ends(name, 'ratio')) %>%
pivot_wider(names_from = 'name', values_from = value) %>%
dplyr::rename(APOBEC_ratio = SBS_APOBEC_ratio) %>%
pivot_longer(-Tumor_Barcode) %>%
mutate(name = str_remove(name, '_ratio')) %>%
dplyr::rename(Mutation_Ratio = value)
tmp <- Mutation_Signature_Probability_SBS %>%
mutate(SBS_APOBEC = SBS2 + SBS12) %>%
select(Tumor_Barcode, ID, MutationType, Clone, starts_with('SBS'))
# tmp <- DP_info_data %>%
# select(Tumor_Barcode,ID,MutationTypes=mutType,Clone) %>%
# left_join(
# sigprobality %>% mutate(SBS_APOBEC=SBS2+SBS12) %>% select(Tumor_Barcode,MutationTypes,contains('SBS'))
# )
tmp <- tmp %>%
filter(!is.na(Clone)) %>%
group_by(Tumor_Barcode, Clone) %>%
summarise_at(vars(contains('SBS')), sum, na.rm = TRUE) %>%
ungroup()
tmp <- tmp %>% pivot_longer(cols = -c(Tumor_Barcode, Clone))
## clone vs suclone mutational signature
#excludeSBS <- c('SBS7a','SBS7b','SBS7c','SBS7d','SBS17a','SBS17b','SBS28','SBS44','SBS39','SBS21')
excludeSBS <- c('SBS21', 'SBS44')
tmp %>%
group_by(name) %>%
do(tidy(wilcox.test(value ~ Clone, data = .))) %>%
ungroup() %>%
arrange(p.value) %>%
mutate(FDR = p.adjust(p.value, method = 'BH'))
## # A tibble: 20 × 6
## name statistic p.value method alternative FDR
## <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 SBS5 107966 1.04e-157 Wilcoxon rank sum test … two.sided 2.08e-156
## 2 SBS1 174607 2.30e- 97 Wilcoxon rank sum test … two.sided 2.30e- 96
## 3 SBS40a 295588 1.37e- 26 Wilcoxon rank sum test … two.sided 9.11e- 26
## 4 SBS4 356762 1.13e- 10 Wilcoxon rank sum test … two.sided 5.65e- 10
## 5 SBS_APOBEC 349921 1.98e- 9 Wilcoxon rank sum test … two.sided 7.93e- 9
## 6 SBS100 373342 6.41e- 8 Wilcoxon rank sum test … two.sided 2.14e- 7
## 7 SBS13 367410 3.96e- 6 Wilcoxon rank sum test … two.sided 1.13e- 5
## 8 SBS2 370846 1.72e- 5 Wilcoxon rank sum test … two.sided 4.29e- 5
## 9 SBS92 408806 1.30e- 1 Wilcoxon rank sum test … two.sided 2.89e- 1
## 10 SBS3 409766. 2.10e- 1 Wilcoxon rank sum test … two.sided 4.20e- 1
## 11 SBS8 418740. 5.29e- 1 Wilcoxon rank sum test … two.sided 8.48e- 1
## 12 SBS22a 416529 5.53e- 1 Wilcoxon rank sum test … two.sided 8.48e- 1
## 13 SBS39 415458. 6.66e- 1 Wilcoxon rank sum test … two.sided 8.48e- 1
## 14 SBS21 414657 6.77e- 1 Wilcoxon rank sum test … two.sided 8.48e- 1
## 15 SBS44 414657 6.77e- 1 Wilcoxon rank sum test … two.sided 8.48e- 1
## 16 SBS33 414656 6.78e- 1 Wilcoxon rank sum test … two.sided 8.48e- 1
## 17 SBS12 416342 7.71e- 1 Wilcoxon rank sum test … two.sided 9.07e- 1
## 18 SBS17a 414969 8.33e- 1 Wilcoxon rank sum test … two.sided 9.25e- 1
## 19 SBS17b 414869 8.92e- 1 Wilcoxon rank sum test … two.sided 9.39e- 1
## 20 SBS18 414484. 9.89e- 1 Wilcoxon rank sum test … two.sided 9.89e- 1
tmp <- tmp %>% mutate(name = str_remove(name, 'SBS_'))
tmp0 <- tmp
tmp <- tmp %>% filter(!(name %in% excludeSBS))
## for the high quality data
tmp <- tmp %>% filter(Tumor_Barcode %in% hq_samples2)
# tmp2 <- tmp %>% group_by(Tumor_Barcode,Clone) %>% summarise(total=sum(value))
tmp2 <- tmp %>%
select(Tumor_Barcode, Clone, name, value) %>%
group_by(Tumor_Barcode, Clone) %>%
summarise(total = sum(value)) %>%
ungroup() %>%
pivot_wider(
names_from = "Clone",
values_from = "total",
values_fill = list(value = 0)
) %>%
dplyr::rename(Ytotal = Y, Ntotal = N)
tmp3 <- tmp %>%
select(Tumor_Barcode, Clone, name, value) %>%
pivot_wider(
names_from = "Clone",
values_from = "value",
values_fill = list(value = 0)
) %>%
left_join(tmp2) %>%
left_join(mutation_ratio) %>%
# left_join(sherlock_sis_numbers %>% select(Tumor_Barcode,SNV_Num)) %>%
filter(Ytotal > 100, Ntotal > 100, Y > 20, N > 20, Mutation_Ratio > 0.02) %>%
mutate(
Nratio = (N / Ntotal),
Yratio = (Y / Ytotal),
Fold = Nratio / Yratio
) %>%
mutate(name = fct_reorder(name, Fold)) %>%
filter(name != 'APOBEC')
tmpvalue <- tmp3 %>%
group_by(name) %>%
do(tidy(wilcox.test(.$Nratio, .$Yratio))) %>%
arrange(p.value) %>%
ungroup()
source(
"./functions/Sigvisualfunc.R"
)
SBScolor <- sigcol
# SBScolor['SBS36'] <- 'yellow4'
# SBScolor['APOBEC'] <- "#f781bf"
# SBScolor['SBS-Novel-A'] <- "#984ea3"
# SBScolor['SBS-Novel-C'] <- "#3D4551"
myggstyle()
library(ggnewscale)
tmp3 %>%
#mutate(Fold=if_else(Fold>4,4,Fold)) %>%
ggplot(aes(name, Fold, fill = name)) +
geom_hline(yintercept = 1, linetype = 2) +
#scale_y_continuous(trans = 'log10',limits = c(0.3,5),expand = c(0,0))+
#annotation_logticks(sides="l")+
geom_jitter(
aes(size = Mutation_Ratio),
position = position_jitter(w = 0.15, h = 0.01),
shape = 21,
col = "gray50",
stroke = 0.4
) +
geom_boxplot(fill = NA, outlier.shape = NA, width = 0.6) +
#scale_y_continuous(breaks = pretty_breaks(),expand = expansion(mult = c(0.02,0.02)))+
scale_y_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = label_log(),
limits = c(0.1, 12)
) +
annotation_logticks(sides = "l", outside = T) +
labs(x = "", y = "Fold Change (Subclonal/Clonal)") +
theme_ipsum_rc(
base_size = 14,
axis_title_size = 14,
axis_title_just = 'm',
grid = "Y",
ticks = TRUE,
axis = ""
) +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
panel_border(size = 0.5, color = "black") +
scale_fill_manual(
"Signatures",
values = SBScolor,
breaks = levels(tmp3$name)
) +
scale_size_binned(n.breaks = 6, range = c(1, 5)) +
guides(
fill = "none",
size = guide_legend(override.aes = list(fill = 'black'))
) + #guide_legend(override.aes = list(size = 3))
ggnewscale::new_scale_fill() +
geom_tile(
data = tmpvalue %>%
mutate(Fold = 0.12, p.value = if_else(p.value > 0.05, NA_real_, p.value)),
aes(fill = -log10(p.value)),
height = .05
) +
scale_fill_viridis_c() +
coord_cartesian(clip = "off")
#ggsave(filename = 'SBS_enrich_subclone.pdf',width = 10,height = 7,device = cairo_pdf)
# ggsave(
# filename = 'SBS_enrich_subclone_hq_ID2paper_tmp.pdf',
# width = 9,
# height = 8,
# device = cairo_pdf
# )
tmp3 %>%
left_join(wgs_groups_info %>% select(Tumor_Barcode, SP_Group)) %>%
#filter(SP_Group != "Others") %>%
left_join(sp_group_data2) %>%
#mutate(Fold=if_else(Fold>4,4,Fold)) %>%
ggplot(aes(name, log2(Fold + 1), fill = name)) +
geom_hline(yintercept = 1, linetype = 2) +
#scale_y_continuous(trans = 'log10',limits = c(0.3,5),expand = c(0,0))+
#annotation_logticks(sides="l")+
geom_jitter(
position = position_jitter(w = 0.15, h = 0.01),
size = 2,
shape = 21,
col = "#bbbbbb",
stroke = 0.2
) +
geom_boxplot(fill = NA, outlier.shape = NA, width = 0.6) +
facet_wrap(~SP_Group_New, scales = 'free_y', ncol = 1) +
scale_y_continuous(
breaks = pretty_breaks(),
expand = expansion(mult = c(0.02, 0.02))
) +
labs(x = "", y = "Fold Change (Subclonal/Clonal)") +
theme_ipsum_rc(
base_size = 14,
axis_title_size = 14,
axis_title_just = 'm',
grid = "Y",
ticks = TRUE,
axis = ""
) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
panel_border(size = 0.5, color = "black") +
scale_fill_manual("Signatures", values = SBScolor, breaks = levels(tmp3$name))
#ggsave(filename = 'SBS_enrich_subclone_group.pdf',width = 18,height = 7,device = cairo_pdf)
# ggsave(
# filename = 'SBS_enrich_subclone_group_hq_ID2paper.pdf',
# width = 9,
# height = 12,
# device = cairo_pdf
# )
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dunnr_0.2.6 ggpubr_0.6.1 janitor_2.2.1 ggforce_0.5.0
## [5] ggtext_0.1.2 ggsci_3.2.0 broom_1.0.8 ggpmisc_0.6.2
## [9] ggpp_0.5.9 ggasym_0.1.6 data.table_1.17.8 ggnewscale_0.5.2
## [13] ggrepel_0.9.6 cowplot_1.2.0 hrbrthemes_0.8.7 scales_1.4.0
## [17] ggsankey_0.0.99999 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [21] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5 tidyr_1.3.1
## [25] tibble_3.3.0 ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 farver_2.1.2
## [4] fastmap_1.2.0 tweenr_2.0.3 fontquiver_0.2.1
## [7] digest_0.6.37 timechange_0.3.0 lifecycle_1.0.4
## [10] survival_3.8-3 magrittr_2.0.3 compiler_4.3.2
## [13] rlang_1.1.6 sass_0.4.10 tools_4.3.2
## [16] utf8_1.2.6 yaml_2.3.10 knitr_1.50
## [19] ggsignif_0.6.4 labeling_0.4.3 xml2_1.3.8
## [22] RColorBrewer_1.1-3 abind_1.4-8 withr_3.0.2
## [25] grid_4.3.2 polyclip_1.10-7 gdtools_0.4.2
## [28] colorspace_2.1-1 extrafontdb_1.0 MASS_7.3-60.0.1
## [31] dichromat_2.0-0.1 cli_3.6.5 rmarkdown_2.29
## [34] generics_0.1.4 rstudioapi_0.17.1 tzdb_0.5.0
## [37] polynom_1.4-1 cachem_1.1.0 splines_4.3.2
## [40] distill_1.6 vctrs_0.6.5 Matrix_1.6-5
## [43] jsonlite_2.0.0 fontBitstreamVera_0.1.1 SparseM_1.84-2
## [46] carData_3.0-5 car_3.1-3 hms_1.1.3
## [49] rstatix_0.7.2 Formula_1.2-5 systemfonts_1.2.3
## [52] jquerylib_0.1.4 glue_1.8.0 stringi_1.8.7
## [55] gtable_0.3.6 downlit_0.4.4 extrafont_0.19
## [58] pillar_1.11.0 htmltools_0.5.8.1 quantreg_6.1
## [61] R6_2.6.1 evaluate_1.0.4 lattice_0.22-7
## [64] backports_1.5.0 gridtext_0.1.5 memoise_2.0.1
## [67] snakecase_0.11.1 fontLiberation_0.1.0 bslib_0.9.0
## [70] MatrixModels_0.5-4 Rcpp_1.1.0 Rttf2pt1_1.3.12
## [73] xfun_0.52 fs_1.6.6 usethis_3.1.0
## [76] pkgconfig_2.0.3